From bbb0eb32255d1a3e97994d9c57ca5e3d50de4298 Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Mon, 18 Jul 2005 20:22:11 +0000 Subject: [PATCH] First cut of new time interfaces and synchronisation mechanisms. Based on an initial patch from Don Fry at IBM. Still TODO: 1. Testing 2. NTP synchronisation 3. Fix wallclock interface a bit 4. Support for platform timers other than PIT (e.g., HPET, IBM Cyclone) 5. Scale 64-bit TSC diffs instead of 32-bit, just for sanity 6. Error-correcting scale factor is still slightly wrong 7. More testing Signed-off-by: Keir Fraser --- .../arch/xen/i386/kernel/Makefile | 2 +- .../arch/xen/i386/kernel/time.c | 346 ++++++----- .../arch/xen/x86_64/kernel/Makefile | 2 +- xen/arch/x86/apic.c | 8 - xen/arch/x86/i8259.c | 8 +- xen/arch/x86/smpboot.c | 20 +- xen/arch/x86/time.c | 537 +++++++++++++----- xen/arch/x86/vmx_intercept.c | 6 +- xen/common/ac_timer.c | 2 +- xen/common/domain.c | 2 - xen/common/page_alloc.c | 18 +- xen/drivers/char/console.c | 2 - xen/include/asm-x86/time.h | 3 + xen/include/public/xen.h | 50 +- xen/include/xen/sched.h | 1 - xen/include/xen/time.h | 3 +- 16 files changed, 648 insertions(+), 362 deletions(-) diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile b/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile index a559ddc54e..afaeeda479 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile @@ -19,7 +19,7 @@ c-obj-y := semaphore.o vm86.o \ s-obj-y := obj-y += cpu/ -obj-y += timers/ +#obj-y += timers/ obj-$(CONFIG_ACPI_BOOT) += acpi/ #c-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o c-obj-$(CONFIG_MCA) += mca.o diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c index 0ca8d7eb64..38197e67a4 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c @@ -104,25 +104,17 @@ extern struct timer_opts timer_tsc; struct 
timer_opts *cur_timer = &timer_tsc; /* These are peridically updated in shared_info, and then copied here. */ -u32 shadow_tsc_stamp; -u64 shadow_system_time; -static u32 shadow_time_version; +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + u32 tsc_to_usec_mul; + int tsc_shift; + u32 version; +}; +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); static struct timeval shadow_tv; -/* - * We use this to ensure that gettimeofday() is monotonically increasing. We - * only break this guarantee if the wall clock jumps backwards "a long way". - */ -static struct timeval last_seen_tv = {0,0}; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -/* Periodically propagate synchronised time base to the RTC and to Xen. */ -static long last_rtc_update, last_update_to_xen; -#endif - -/* Periodically take synchronised time base from Xen, if we need it. */ -static long last_update_from_xen; /* UTC seconds when last read Xen clock. */ - /* Keep track of last time we did processing/updating of jiffies and xtime. */ static u64 processed_system_time; /* System time (ns) at last processing. */ static DEFINE_PER_CPU(u64, processed_system_time); @@ -164,26 +156,147 @@ __setup("independent_wallclock", __independent_wallclock); #define INDEPENDENT_WALLCLOCK() \ (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN)) +int tsc_disable __initdata = 0; + +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do + { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +struct timer_opts timer_tsc = { + .name = "tsc", + .delay = delay_tsc, +}; + +static inline u32 down_shift(u64 time, int shift) +{ + if ( shift < 0 ) + return (u32)(time >> -shift); + return (u32)((u32)time << shift); +} + +/* + * 32-bit multiplication of integer multiplicand and fractional multiplier + * yielding 32-bit integer product. 
+ */ +static inline u32 mul_frac(u32 multiplicand, u32 multiplier) +{ + u32 product_int, product_frac; + __asm__ ( + "mul %3" + : "=a" (product_frac), "=d" (product_int) + : "0" (multiplicand), "r" (multiplier) ); + return product_int; +} + +void init_cpu_khz(void) +{ + u64 __cpu_khz = 1000000ULL << 32; + struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_time[0]; + do_div(__cpu_khz, info->tsc_to_system_mul); + cpu_khz = down_shift(__cpu_khz, -info->tsc_shift); + printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now; + u32 delta; + rdtscll(now); + delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift); + return mul_frac(delta, shadow->tsc_to_nsec_mul); +} + +static unsigned long get_usec_offset(struct shadow_time_info *shadow) +{ + u64 now; + u32 delta; + rdtscll(now); + delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift); + return mul_frac(delta, shadow->tsc_to_usec_mul); +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + long wtm_nsec; + time_t wtm_sec, sec; + s64 nsec; + + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_usec = s->wc_usec; + + if (INDEPENDENT_WALLCLOCK()) + return; + + if ((time_status & STA_UNSYNC) != 0) + return; + + /* Adjust shadow for jiffies that haven't updated xtime yet. */ + shadow_tv.tv_usec -= + (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); + HANDLE_USEC_UNDERFLOW(shadow_tv); + + /* Update our unsynchronised xtime appropriately. 
*/ + sec = shadow_tv.tv_sec; + nsec = shadow_tv.tv_usec * NSEC_PER_USEC; + + __normalize_time(&sec, &nsec); + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); +} + /* * Reads a consistent set of time-base values from Xen, into a shadow data * area. Must be called with the xtime_lock held for writing. */ static void __get_time_values_from_xen(void) { - shared_info_t *s = HYPERVISOR_shared_info; + shared_info_t *s = HYPERVISOR_shared_info; + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &s->vcpu_time[smp_processor_id()]; + dst = &per_cpu(shadow_time, smp_processor_id()); do { - shadow_time_version = s->time_version2; + dst->version = src->time_version2; rmb(); - shadow_tv.tv_sec = s->wc_sec; - shadow_tv.tv_usec = s->wc_usec; - shadow_tsc_stamp = (u32)s->tsc_timestamp; - shadow_system_time = s->system_time; + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; rmb(); } - while (shadow_time_version != s->time_version1); + while (dst->version != src->time_version1); + + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; - cur_timer->mark_offset(); + if ((shadow_tv.tv_sec != s->wc_sec) || + (shadow_tv.tv_usec != s->wc_usec)) + update_wallclock(); +} + +static inline int time_values_up_to_date(int cpu) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &HYPERVISOR_shared_info->vcpu_time[smp_processor_id()]; + dst = &per_cpu(shadow_time, smp_processor_id()); + + return (dst->version == src->time_version2); } #define TIME_VALUES_UP_TO_DATE \ @@ -229,13 +342,18 @@ void do_gettimeofday(struct timeval *tv) unsigned long max_ntp_tick; unsigned long flags; s64 nsec; + unsigned int cpu; + struct shadow_time_info *shadow; + + cpu = 
get_cpu(); + shadow = &per_cpu(shadow_time, cpu); do { unsigned long lost; seq = read_seqbegin(&xtime_lock); - usec = cur_timer->get_offset(); + usec = get_usec_offset(shadow); lost = jiffies - wall_jiffies; /* @@ -256,11 +374,11 @@ void do_gettimeofday(struct timeval *tv) sec = xtime.tv_sec; usec += (xtime.tv_nsec / NSEC_PER_USEC); - nsec = shadow_system_time - processed_system_time; + nsec = shadow->system_timestamp - processed_system_time; __normalize_time(&sec, &nsec); usec += (long)nsec / NSEC_PER_USEC; - if (unlikely(!TIME_VALUES_UP_TO_DATE)) { + if (unlikely(!time_values_up_to_date(cpu))) { /* * We may have blocked for a long time, * rendering our calculations invalid @@ -275,21 +393,13 @@ void do_gettimeofday(struct timeval *tv) } } while (read_seqretry(&xtime_lock, seq)); + put_cpu(); + while (usec >= USEC_PER_SEC) { usec -= USEC_PER_SEC; sec++; } - /* Ensure that time-of-day is monotonically increasing. */ - if ((sec < last_seen_tv.tv_sec) || - ((sec == last_seen_tv.tv_sec) && (usec < last_seen_tv.tv_usec))) { - sec = last_seen_tv.tv_sec; - usec = last_seen_tv.tv_usec; - } else { - last_seen_tv.tv_sec = sec; - last_seen_tv.tv_usec = usec; - } - tv->tv_sec = sec; tv->tv_usec = usec; } @@ -302,6 +412,8 @@ int do_settimeofday(struct timespec *tv) long wtm_nsec; s64 nsec; struct timespec xentime; + unsigned int cpu; + struct shadow_time_info *shadow; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -309,6 +421,9 @@ int do_settimeofday(struct timespec *tv) if (!INDEPENDENT_WALLCLOCK()) return 0; /* Silent failure? */ + cpu = get_cpu(); + shadow = &per_cpu(shadow_time, cpu); + write_seqlock_irq(&xtime_lock); /* @@ -317,9 +432,8 @@ int do_settimeofday(struct timespec *tv) * be stale, so we can retry with fresh ones. 
*/ again: - nsec = (s64)tv->tv_nsec - - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC); - if (unlikely(!TIME_VALUES_UP_TO_DATE)) { + nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow); + if (unlikely(!time_values_up_to_date(cpu))) { __get_time_values_from_xen(); goto again; } @@ -335,7 +449,7 @@ int do_settimeofday(struct timespec *tv) */ nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - nsec -= (shadow_system_time - processed_system_time); + nsec -= (shadow->system_timestamp - processed_system_time); __normalize_time(&sec, &nsec); wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); @@ -349,24 +463,21 @@ int do_settimeofday(struct timespec *tv) time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - /* Reset all our running time counts. They make no sense now. */ - last_seen_tv.tv_sec = 0; - last_update_from_xen = 0; - #ifdef CONFIG_XEN_PRIVILEGED_GUEST if (xen_start_info.flags & SIF_INITDOMAIN) { dom0_op_t op; - last_rtc_update = last_update_to_xen = 0; op.cmd = DOM0_SETTIME; op.u.settime.secs = xentime.tv_sec; op.u.settime.usecs = xentime.tv_nsec / NSEC_PER_USEC; - op.u.settime.system_time = shadow_system_time; + op.u.settime.system_time = shadow->system_timestamp; write_sequnlock_irq(&xtime_lock); HYPERVISOR_dom0_op(&op); } else #endif write_sequnlock_irq(&xtime_lock); + put_cpu(); + clock_was_set(); return 0; } @@ -403,10 +514,31 @@ static int set_rtc_mmss(unsigned long nowtime) */ unsigned long long monotonic_clock(void) { - return cur_timer->monotonic_clock(); + int cpu = get_cpu(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + s64 off; + unsigned long flags; + + for ( ; ; ) { + off = get_nsec_offset(shadow); + if (time_values_up_to_date(cpu)) + break; + write_seqlock_irqsave(&xtime_lock, flags); + __get_time_values_from_xen(); + write_sequnlock_irqrestore(&xtime_lock, flags); + } + + put_cpu(); + + return shadow->system_timestamp + off; } EXPORT_SYMBOL(monotonic_clock); +unsigned long long sched_clock(void) +{ + 
return monotonic_clock(); +} + #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { @@ -427,27 +559,26 @@ EXPORT_SYMBOL(profile_pc); static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - time_t wtm_sec, sec; - s64 delta, delta_cpu, nsec; - long sec_diff, wtm_nsec; + s64 delta, delta_cpu; int cpu = smp_processor_id(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); do { __get_time_values_from_xen(); - delta = delta_cpu = (s64)shadow_system_time + - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC); + delta = delta_cpu = + shadow->system_timestamp + get_nsec_offset(shadow); delta -= processed_system_time; delta_cpu -= per_cpu(processed_system_time, cpu); } - while (!TIME_VALUES_UP_TO_DATE); + while (!time_values_up_to_date(cpu)); if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) { printk("Timer ISR/%d: Time went backwards: " "delta=%lld cpu_delta=%lld shadow=%lld " "off=%lld processed=%lld cpu_processed=%lld\n", - cpu, delta, delta_cpu, shadow_system_time, - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC), + cpu, delta, delta_cpu, shadow->system_timestamp, + (s64)get_nsec_offset(shadow), processed_system_time, per_cpu(processed_system_time, cpu)); for (cpu = 0; cpu < num_online_cpus(); cpu++) @@ -470,76 +601,6 @@ static inline void do_timer_interrupt(int irq, void *dev_id, update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING, regs); } - - if (cpu != 0) - return; - - /* - * Take synchronised time from Xen once a minute if we're not - * synchronised ourselves, and we haven't chosen to keep an independent - * time base. - */ - if (!INDEPENDENT_WALLCLOCK() && - ((time_status & STA_UNSYNC) != 0) && - (xtime.tv_sec > (last_update_from_xen + 60))) { - /* Adjust shadow for jiffies that haven't updated xtime yet. 
*/ - shadow_tv.tv_usec -= - (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); - HANDLE_USEC_UNDERFLOW(shadow_tv); - - /* - * Reset our running time counts if they are invalidated by - * a warp backwards of more than 500ms. - */ - sec_diff = xtime.tv_sec - shadow_tv.tv_sec; - if (unlikely(abs(sec_diff) > 1) || - unlikely(((sec_diff * USEC_PER_SEC) + - (xtime.tv_nsec / NSEC_PER_USEC) - - shadow_tv.tv_usec) > 500000)) { -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - last_rtc_update = last_update_to_xen = 0; -#endif - last_seen_tv.tv_sec = 0; - } - - /* Update our unsynchronised xtime appropriately. */ - sec = shadow_tv.tv_sec; - nsec = shadow_tv.tv_usec * NSEC_PER_USEC; - - __normalize_time(&sec, &nsec); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - last_update_from_xen = sec; - } - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if (!(xen_start_info.flags & SIF_INITDOMAIN)) - return; - - /* Send synchronised time to Xen approximately every minute. 
*/ - if (((time_status & STA_UNSYNC) == 0) && - (xtime.tv_sec > (last_update_to_xen + 60))) { - dom0_op_t op; - struct timeval tv; - - tv.tv_sec = xtime.tv_sec; - tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; - tv.tv_usec += (jiffies - wall_jiffies) * (USEC_PER_SEC/HZ); - HANDLE_USEC_OVERFLOW(tv); - - op.cmd = DOM0_SETTIME; - op.u.settime.secs = tv.tv_sec; - op.u.settime.usecs = tv.tv_usec; - op.u.settime.system_time = shadow_system_time; - HYPERVISOR_dom0_op(&op); - - last_update_to_xen = xtime.tv_sec; - } -#endif } /* @@ -731,12 +792,10 @@ void __init time_init(void) xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - processed_system_time = shadow_system_time; + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; per_cpu(processed_system_time, 0) = processed_system_time; - if (timer_tsc_init.init(NULL) != 0) - BUG(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); + init_cpu_khz(); #if defined(__x86_64__) vxtime.mode = VXTIME_TSC; @@ -807,21 +866,15 @@ void time_suspend(void) /* No locking required. We are only CPU running, and interrupts are off. */ void time_resume(void) { - if (timer_tsc_init.init(NULL) != 0) - BUG(); + init_cpu_khz(); /* Get timebases for new environment. */ __get_time_values_from_xen(); /* Reset our own concept of passage of system time. */ - processed_system_time = shadow_system_time; + processed_system_time = + per_cpu(shadow_time, smp_processor_id()).system_timestamp; per_cpu(processed_system_time, 0) = processed_system_time; - - /* Accept a warp in UTC (wall-clock) time. */ - last_seen_tv.tv_sec = 0; - - /* Make sure we resync UTC time with Xen on next timer interrupt. 
*/ - last_update_from_xen = 0; } #ifdef CONFIG_SMP @@ -832,7 +885,8 @@ void local_setup_timer(void) do { seq = read_seqbegin(&xtime_lock); - per_cpu(processed_system_time, cpu) = shadow_system_time; + per_cpu(processed_system_time, cpu) = + per_cpu(shadow_time, cpu).system_timestamp; } while (read_seqretry(&xtime_lock, seq)); per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER); @@ -861,3 +915,13 @@ static int __init xen_sysctl_init(void) return 0; } __initcall(xen_sysctl_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile index ed4d0156f0..27b28886b3 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile @@ -15,7 +15,7 @@ c-obj-y := semaphore.o i387.o sys_x86_64.o \ ptrace.o quirks.o syscall.o bootflag.o i386-obj-y := time.o -obj-y += ../../i386/kernel/timers/ +#obj-y += ../../i386/kernel/timers/ s-obj-y := diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c index 50497c55c7..7237bef788 100644 --- a/xen/arch/x86/apic.c +++ b/xen/arch/x86/apic.c @@ -723,16 +723,8 @@ void __setup_APIC_LVTT(unsigned int clocks) static void __init setup_APIC_timer(unsigned int clocks) { unsigned long flags; - local_irq_save(flags); - - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); - __setup_APIC_LVTT(clocks); - local_irq_restore(flags); } diff --git a/xen/arch/x86/i8259.c b/xen/arch/x86/i8259.c index 7fc9340d78..38fb41c467 100644 --- a/xen/arch/x86/i8259.c +++ b/xen/arch/x86/i8259.c @@ -19,7 +19,7 @@ #include #include #include - +#include /* * Common place to define all x86 IRQ vectors @@ -395,9 +395,9 @@ void __init init_IRQ(void) /* Set the clock to HZ Hz */ #define CLOCK_TICK_RATE 1193180 /* crystal freq (Hz) */ #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ) - 
outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(LATCH & 0xff , 0x40); /* LSB */ - outb(LATCH >> 8 , 0x40); /* MSB */ + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + outb(LATCH >> 8, PIT_CH0); /* MSB */ setup_irq(2, &cascade); } diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c index c9e1ac9151..392c6f1abb 100644 --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -406,6 +407,7 @@ void __init smp_callin(void) */ if (cpu_has_tsc && cpu_khz) synchronize_tsc_ap(); + calibrate_tsc_ap(); } int cpucount; @@ -465,6 +467,8 @@ void __init start_secondary(void *unused) /* We can take interrupts now: we're officially "up". */ local_irq_enable(); + init_percpu_time(); + wmb(); startup_cpu_idle_loop(); } @@ -1149,6 +1153,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) */ if (cpu_has_tsc && cpucount && cpu_khz) synchronize_tsc_bp(); + calibrate_tsc_bp(); } /* These are wrappers to interface to the new boot process. Someone @@ -1167,22 +1172,21 @@ void __devinit smp_prepare_boot_cpu(void) int __devinit __cpu_up(unsigned int cpu) { /* This only works at boot for x86. See "rewrite" above. */ - if (cpu_isset(cpu, smp_commenced_mask)) { - local_irq_enable(); + if (cpu_isset(cpu, smp_commenced_mask)) return -ENOSYS; - } /* In case one didn't come up */ - if (!cpu_isset(cpu, cpu_callin_map)) { - local_irq_enable(); + if (!cpu_isset(cpu, cpu_callin_map)) return -EIO; - } - local_irq_enable(); /* Unleash the CPU! 
*/ cpu_set(cpu, smp_commenced_mask); - while (!cpu_isset(cpu, cpu_online_map)) + while (!cpu_isset(cpu, cpu_online_map)) { mb(); + if (softirq_pending(0)) + do_softirq(); + } + return 0; } diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index 5cd738bcde..9df26ca63a 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -1,16 +1,12 @@ -/**************************************************************************** - * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge - * (C) 2002-2003 University of Cambridge - **************************************************************************** - * - * File: i386/time.c - * Author: Rolf Neugebar & Keir Fraser - */ - -/* - * linux/arch/i386/kernel/time.c - * - * Copyright (C) 1991, 1992, 1995 Linus Torvalds +/****************************************************************************** + * arch/x86/time.c + * + * Per-CPU time calibration and management. + * + * Copyright (c) 2002-2005, K A Fraser + * + * Portions from Linux are: + * Copyright (c) 1991, 1992, 1995 Linus Torvalds */ #include @@ -31,29 +27,74 @@ #include #include #include +#include +#include -/* GLOBAL */ unsigned long cpu_khz; /* CPU clock frequency in kHz. */ spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; int timer_ack = 0; unsigned long volatile jiffies; +static unsigned long wc_sec, wc_usec; /* UTC time at last 'time update'. */ + +struct time_scale { + int shift; + u32 mul_frac; +}; + +struct cpu_time { + u64 local_tsc_stamp; + s_time_t stime_local_stamp; + s_time_t stime_master_stamp; + struct time_scale tsc_scale; + struct ac_timer calibration_timer; +} __cacheline_aligned; + +static struct cpu_time cpu_time[NR_CPUS]; + +/* Protected by platform_timer_lock. 
*/ +static s_time_t stime_platform_stamp; +static u64 platform_timer_stamp; +static struct time_scale platform_timer_scale; +static spinlock_t platform_timer_lock = SPIN_LOCK_UNLOCKED; + +static inline u32 down_shift(u64 time, int shift) +{ + if ( shift < 0 ) + return (u32)(time >> -shift); + return (u32)((u32)time << shift); +} -/* PRIVATE */ -static unsigned int rdtsc_bitshift; /* Which 32 bits of TSC do we use? */ -static u64 cpu_freq; /* CPU frequency (Hz) */ -static u32 st_scale_f; /* Cycles -> ns, fractional part */ -static u32 st_scale_i; /* Cycles -> ns, integer part */ -static u32 shifted_tsc_irq; /* CPU0's TSC at last 'time update' */ -static u64 full_tsc_irq; /* ...ditto, but all 64 bits */ -static s_time_t stime_irq; /* System time at last 'time update' */ -static unsigned long wc_sec, wc_usec; /* UTC time at last 'time update'. */ -static rwlock_t time_lock = RW_LOCK_UNLOCKED; +/* + * 32-bit division of integer dividend and integer divisor yielding + * 32-bit fractional quotient. + */ +static inline u32 div_frac(u32 dividend, u32 divisor) +{ + u32 quotient, remainder; + ASSERT(dividend < divisor); + __asm__ ( + "div %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor) ); + return quotient; +} -void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) +/* + * 32-bit multiplication of integer multiplicand and fractional multiplier + * yielding 32-bit integer product. 
+ */ +static inline u32 mul_frac(u32 multiplicand, u32 multiplier) { - write_lock_irq(&time_lock); + u32 product_int, product_frac; + __asm__ ( + "mul %3" + : "=a" (product_frac), "=d" (product_int) + : "0" (multiplicand), "r" (multiplier) ); + return product_int; +} -#ifdef CONFIG_X86_IO_APIC +void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) +{ if ( timer_ack ) { extern spinlock_t i8259A_lock; @@ -63,31 +104,10 @@ void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) inb(0x20); spin_unlock(&i8259A_lock); } -#endif - /* - * Updates TSC timestamp (used to interpolate passage of time between - * interrupts). - */ - rdtscll(full_tsc_irq); - shifted_tsc_irq = (u32)(full_tsc_irq >> rdtsc_bitshift); - /* Update jiffies counter. */ (*(unsigned long *)&jiffies)++; - /* Update wall time. */ - wc_usec += 1000000/HZ; - if ( wc_usec >= 1000000 ) - { - wc_usec -= 1000000; - wc_sec++; - } - - /* Updates system time (nanoseconds since boot). */ - stime_irq += MILLISECS(1000/HZ); - - write_unlock_irq(&time_lock); - /* Rough hack to allow accurate timers to sort-of-work with no APIC. */ if ( !cpu_has_apic ) raise_softirq(AC_TIMER_SOFTIRQ); @@ -103,9 +123,9 @@ static struct irqaction irq0 = { timer_interrupt, "timer", NULL}; #define CALIBRATE_FRAC 20 /* calibrate over 50ms */ #define CALIBRATE_LATCH ((CLOCK_TICK_RATE+(CALIBRATE_FRAC/2))/CALIBRATE_FRAC) -static unsigned long __init calibrate_tsc(void) +static u64 calibrate_boot_tsc(void) { - u64 start, end, diff; + u64 start, end; unsigned long count; /* Set the Gate high, disable speaker */ @@ -118,9 +138,9 @@ static unsigned long __init calibrate_tsc(void) * terminal count mode), binary count, load 5 * LATCH count, (LSB and MSB) * to begin countdown. 
*/ - outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */ - outb(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */ - outb(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */ + outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */ + outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */ + outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */ rdtscll(start); for ( count = 0; (inb(0x61) & 0x20) == 0; count++ ) @@ -131,15 +151,147 @@ static unsigned long __init calibrate_tsc(void) if ( count == 0 ) return 0; - diff = end - start; + return ((end - start) * (u64)CALIBRATE_FRAC); +} -#if defined(__i386__) - /* If quotient doesn't fit in 32 bits then we return error (zero). */ - if ( diff & ~0xffffffffULL ) - return 0; -#endif +static void set_time_scale(struct time_scale *ts, u64 ticks_per_sec) +{ + u64 tps64 = ticks_per_sec; + u32 tps32; + int shift = 0; + + while ( tps64 > (MILLISECS(1000)*2) ) + { + tps64 >>= 1; + shift--; + } + + tps32 = (u32)tps64; + while ( tps32 < (u32)MILLISECS(1000) ) + { + tps32 <<= 1; + shift++; + } + + ts->mul_frac = div_frac(MILLISECS(1000), tps32); + ts->shift = shift; +} + +static atomic_t tsc_calibrate_gang = ATOMIC_INIT(0); +static unsigned int tsc_calibrate_status = 0; + +void calibrate_tsc_bp(void) +{ + while ( atomic_read(&tsc_calibrate_gang) != (num_booting_cpus() - 1) ) + mb(); + + outb(CALIBRATE_LATCH & 0xff, PIT_CH2); + outb(CALIBRATE_LATCH >> 8, PIT_CH2); + + tsc_calibrate_status = 1; + wmb(); + + while ( (inb(0x61) & 0x20) == 0 ) + continue; + + tsc_calibrate_status = 2; + wmb(); + + while ( atomic_read(&tsc_calibrate_gang) != 0 ) + mb(); +} + +void calibrate_tsc_ap(void) +{ + u64 t1, t2, ticks_per_sec; + + atomic_inc(&tsc_calibrate_gang); + + while ( tsc_calibrate_status < 1 ) + mb(); + + rdtscll(t1); + + while ( tsc_calibrate_status < 2 ) + mb(); + + rdtscll(t2); + + ticks_per_sec = (t2 - t1) * (u64)CALIBRATE_FRAC; + set_time_scale(&cpu_time[smp_processor_id()].tsc_scale, ticks_per_sec); + + 
atomic_dec(&tsc_calibrate_gang); +} + +/* Protected by platform_timer_lock. */ +static u64 platform_pit_counter; +static u16 pit_stamp; +static struct ac_timer pit_overflow_timer; + +static u16 pit_read_counter(void) +{ + u16 count; + ASSERT(spin_is_locked(&platform_timer_lock)); + outb(0x80, PIT_MODE); + count = inb(PIT_CH2); + count |= inb(PIT_CH2) << 8; + return count; +} + +static void pit_overflow(void *unused) +{ + u16 counter; + + spin_lock(&platform_timer_lock); + counter = pit_read_counter(); + platform_pit_counter += (u16)(pit_stamp - counter); + pit_stamp = counter; + spin_unlock(&platform_timer_lock); + + set_ac_timer(&pit_overflow_timer, NOW() + MILLISECS(20)); +} + +static void init_platform_timer(void) +{ + init_ac_timer(&pit_overflow_timer, pit_overflow, NULL, 0); + pit_overflow(NULL); + platform_timer_stamp = platform_pit_counter; + set_time_scale(&platform_timer_scale, CLOCK_TICK_RATE); +} + +static s_time_t __read_platform_stime(u64 platform_time) +{ + u64 diff64 = platform_time - platform_timer_stamp; + u32 diff = down_shift(diff64, platform_timer_scale.shift); + ASSERT(spin_is_locked(&platform_timer_lock)); + return (stime_platform_stamp + + (u64)mul_frac(diff, platform_timer_scale.mul_frac)); +} + +static s_time_t read_platform_stime(void) +{ + u64 counter; + s_time_t stime; - return (unsigned long)diff; + spin_lock(&platform_timer_lock); + counter = platform_pit_counter + (u16)(pit_stamp - pit_read_counter()); + stime = __read_platform_stime(counter); + spin_unlock(&platform_timer_lock); + + return stime; +} + +static void platform_time_calibration(void) +{ + u64 counter; + s_time_t stamp; + + spin_lock(&platform_timer_lock); + counter = platform_pit_counter + (u16)(pit_stamp - pit_read_counter()); + stamp = __read_platform_stime(counter); + stime_platform_stamp = stamp; + platform_timer_stamp = counter; + spin_unlock(&platform_timer_lock); } @@ -233,141 +385,215 @@ static unsigned long get_cmos_time(void) * System Time 
***************************************************************************/ -static inline u64 get_time_delta(void) -{ - s32 delta_tsc; - u32 low; - u64 delta, tsc; - - ASSERT(st_scale_f || st_scale_i); - - rdtscll(tsc); - low = (u32)(tsc >> rdtsc_bitshift); - delta_tsc = (s32)(low - shifted_tsc_irq); - if ( unlikely(delta_tsc < 0) ) delta_tsc = 0; - delta = ((u64)delta_tsc * st_scale_f); - delta >>= 32; - delta += ((u64)delta_tsc * st_scale_i); - - return delta; -} - s_time_t get_s_time(void) { + struct cpu_time *t = &cpu_time[smp_processor_id()]; + u64 tsc; + u32 delta; s_time_t now; - unsigned long flags; - - read_lock_irqsave(&time_lock, flags); - - now = stime_irq + get_time_delta(); - /* Ensure that the returned system time is monotonically increasing. */ - { - static s_time_t prev_now = 0; - if ( unlikely(now < prev_now) ) - now = prev_now; - prev_now = now; - } - - read_unlock_irqrestore(&time_lock, flags); + rdtscll(tsc); + delta = down_shift(tsc - t->local_tsc_stamp, t->tsc_scale.shift); + now = t->stime_local_stamp + (u64)mul_frac(delta, t->tsc_scale.mul_frac); - return now; + return now; } static inline void __update_dom_time(struct vcpu *v) { - struct domain *d = v->domain; - shared_info_t *si = d->shared_info; + struct cpu_time *t = &cpu_time[smp_processor_id()]; + struct vcpu_time_info *u = &v->domain->shared_info->vcpu_time[v->vcpu_id]; - spin_lock(&d->time_lock); - - si->time_version1++; + u->time_version1++; wmb(); - si->cpu_freq = cpu_freq; - si->tsc_timestamp = full_tsc_irq; - si->system_time = stime_irq; - si->wc_sec = wc_sec; - si->wc_usec = wc_usec; + u->tsc_timestamp = t->local_tsc_stamp; + u->system_time = t->stime_local_stamp; + u->tsc_to_system_mul = t->tsc_scale.mul_frac; + u->tsc_shift = (s8)t->tsc_scale.shift; wmb(); - si->time_version2++; + u->time_version2++; - spin_unlock(&d->time_lock); + /* Should only do this during do_settime(). 
*/ + v->domain->shared_info->wc_sec = wc_sec; + v->domain->shared_info->wc_usec = wc_usec; } void update_dom_time(struct vcpu *v) { - unsigned long flags; - - if ( v->domain->shared_info->tsc_timestamp != full_tsc_irq ) - { - read_lock_irqsave(&time_lock, flags); + if ( v->domain->shared_info->vcpu_time[v->vcpu_id].tsc_timestamp != + cpu_time[smp_processor_id()].local_tsc_stamp ) __update_dom_time(v); - read_unlock_irqrestore(&time_lock, flags); - } } /* Set clock to after 00:00:00 UTC, 1 January, 1970. */ void do_settime(unsigned long secs, unsigned long usecs, u64 system_time_base) { - s64 delta; - long _usecs = (long)usecs; + u64 x, base_usecs; + u32 y; + + base_usecs = system_time_base; + do_div(base_usecs, 1000); + + x = (secs * 1000000ULL) + (u64)usecs + base_usecs; + y = do_div(x, 1000000); - write_lock_irq(&time_lock); + wc_sec = (unsigned long)x; + wc_usec = (unsigned long)y; - delta = (s64)(stime_irq - system_time_base); + __update_dom_time(current); +} - _usecs += (long)(delta/1000); - while ( _usecs >= 1000000 ) +static void local_time_calibration(void *unused) +{ + unsigned int cpu = smp_processor_id(); + + /* + * System timestamps, extrapolated from local and master oscillators, + * taken during this calibration and the previous calibration. + */ + s_time_t prev_local_stime, curr_local_stime; + s_time_t prev_master_stime, curr_master_stime; + + /* TSC timestamps taken during this calibration and prev calibration. */ + u64 prev_tsc, curr_tsc; + + /* + * System time and TSC ticks elapsed during the previous calibration + * 'epoch'. Also the accumulated error in the local estimate. All these + * values end up down-shifted to fit in 32 bits. + */ + u64 stime_elapsed64, tsc_elapsed64, local_stime_error64; + u32 stime_elapsed32, tsc_elapsed32, local_stime_error32; + + /* Calculated TSC shift to ensure 32-bit scale multiplier. 
*/ + int tsc_shift = 0; + + prev_tsc = cpu_time[cpu].local_tsc_stamp; + prev_local_stime = cpu_time[cpu].stime_local_stamp; + prev_master_stime = cpu_time[cpu].stime_master_stamp; + + /* Disable IRQs to get 'instantaneous' current timestamps. */ + local_irq_disable(); + rdtscll(curr_tsc); + curr_local_stime = get_s_time(); + curr_master_stime = read_platform_stime(); + local_irq_enable(); + +#if 0 + printk("PRE%d: tsc=%lld stime=%lld master=%lld\n", + cpu, prev_tsc, prev_local_stime, prev_master_stime); + printk("CUR%d: tsc=%lld stime=%lld master=%lld %lld\n", + cpu, curr_tsc, curr_local_stime, curr_master_stime, + platform_pit_counter); +#endif + + /* Local time warps forward if it lags behind master time. */ + if ( curr_local_stime < curr_master_stime ) + curr_local_stime = curr_master_stime; + + stime_elapsed64 = curr_master_stime - prev_master_stime; + tsc_elapsed64 = curr_tsc - prev_tsc; + + /* + * Error in the local system time estimate. Clamp to epoch time period, or + * we could end up with a negative scale factor (time going backwards!). + * This effectively clamps the scale factor to >= 0. + */ + local_stime_error64 = curr_local_stime - curr_master_stime; + if ( local_stime_error64 > stime_elapsed64 ) + local_stime_error64 = stime_elapsed64; + + /* + * We require 0 < stime_elapsed < 2^31. + * This allows us to binary shift a 32-bit tsc_elapsed such that: + * stime_elapsed < tsc_elapsed <= 2*stime_elapsed + */ + while ( ((u32)stime_elapsed64 != stime_elapsed64) || + ((s32)stime_elapsed64 < 0) ) { - _usecs -= 1000000; - secs++; + stime_elapsed64 >>= 1; + tsc_elapsed64 >>= 1; + local_stime_error64 >>= 1; } - wc_sec = secs; - wc_usec = _usecs; + /* stime_master_diff (and hence stime_error) now fit in a 32-bit word. */ + stime_elapsed32 = (u32)stime_elapsed64; + local_stime_error32 = (u32)local_stime_error64; - /* Others will pick up the change at the next tick. 
*/ - __update_dom_time(current); - send_guest_virq(current, VIRQ_TIMER); + /* tsc_elapsed <= 2*stime_elapsed */ + while ( tsc_elapsed64 > (stime_elapsed32 * 2) ) + { + tsc_elapsed64 >>= 1; + tsc_shift--; + } - write_unlock_irq(&time_lock); -} + /* Local difference must now fit in 32 bits. */ + ASSERT((u32)tsc_elapsed64 == tsc_elapsed64); + tsc_elapsed32 = (u32)tsc_elapsed64; + /* tsc_elapsed > stime_elapsed */ + ASSERT(tsc_elapsed32 != 0); + while ( tsc_elapsed32 <= stime_elapsed32 ) + { + tsc_elapsed32 <<= 1; + tsc_shift++; + } -/* Late init function (after all CPUs are booted). */ -int __init init_xen_time() +#if 0 + printk("---%d: %08x %d\n", cpu, + div_frac(stime_elapsed32 - local_stime_error32, tsc_elapsed32), + tsc_shift); +#endif + + /* Record new timestamp information. */ + cpu_time[cpu].tsc_scale.mul_frac = + div_frac(stime_elapsed32 - local_stime_error32, tsc_elapsed32); + cpu_time[cpu].tsc_scale.shift = tsc_shift; + cpu_time[cpu].local_tsc_stamp = curr_tsc; + cpu_time[cpu].stime_local_stamp = curr_local_stime; + cpu_time[cpu].stime_master_stamp = curr_master_stime; + + set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + MILLISECS(1000)); + + if ( cpu == 0 ) + platform_time_calibration(); +} + +void init_percpu_time(void) { - u64 scale; - unsigned int cpu_ghz; + unsigned int cpu = smp_processor_id(); + unsigned long flags; + s_time_t now; - cpu_ghz = (unsigned int)(cpu_freq / 1000000000ULL); - for ( rdtsc_bitshift = 0; cpu_ghz != 0; rdtsc_bitshift++, cpu_ghz >>= 1 ) - continue; + local_irq_save(flags); + rdtscll(cpu_time[cpu].local_tsc_stamp); + now = (cpu == 0) ? 
0 : read_platform_stime(); + local_irq_restore(flags); + + cpu_time[cpu].stime_master_stamp = now; + cpu_time[cpu].stime_local_stamp = now; - scale = 1000000000LL << (32 + rdtsc_bitshift); - scale /= cpu_freq; - st_scale_f = scale & 0xffffffff; - st_scale_i = scale >> 32; + init_ac_timer(&cpu_time[cpu].calibration_timer, + local_time_calibration, NULL, cpu); + set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + MILLISECS(1000)); +} + +/* Late init function (after all CPUs are booted). */ +int __init init_xen_time(void) +{ + wc_sec = get_cmos_time(); local_irq_disable(); - /* System time ticks from zero. */ - rdtscll(full_tsc_irq); - stime_irq = (s_time_t)0; - shifted_tsc_irq = (u32)(full_tsc_irq >> rdtsc_bitshift); + init_percpu_time(); - /* Wallclock time starts as the initial RTC time. */ - wc_sec = get_cmos_time(); + stime_platform_stamp = 0; + init_platform_timer(); local_irq_enable(); - printk("Time init:\n"); - printk(".... cpu_freq: %08X:%08X\n", (u32)(cpu_freq>>32),(u32)cpu_freq); - printk(".... scale: %08X:%08X\n", (u32)(scale>>32),(u32)scale); - printk(".... Wall Clock: %lds %ldus\n", wc_sec, wc_usec); - return 0; } @@ -375,15 +601,12 @@ int __init init_xen_time() /* Early init function. 
*/ void __init early_time_init(void) { - unsigned long ticks_per_frac = calibrate_tsc(); - - if ( !ticks_per_frac ) - panic("Error calibrating TSC\n"); - - cpu_khz = ticks_per_frac / (1000/CALIBRATE_FRAC); + u64 tmp = calibrate_boot_tsc(); - cpu_freq = (u64)ticks_per_frac * (u64)CALIBRATE_FRAC; + set_time_scale(&cpu_time[0].tsc_scale, tmp); + do_div(tmp, 1000); + cpu_khz = (unsigned long)tmp; printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); diff --git a/xen/arch/x86/vmx_intercept.c b/xen/arch/x86/vmx_intercept.c index 2a8322618a..15b8caf371 100644 --- a/xen/arch/x86/vmx_intercept.c +++ b/xen/arch/x86/vmx_intercept.c @@ -24,10 +24,10 @@ #include #include #include - #include #include #include +#include #ifdef CONFIG_VMX @@ -175,7 +175,7 @@ int intercept_pit_io(ioreq_t *p) p->port_mm) return 0; - if (p->addr == 0x43 && + if (p->addr == PIT_MODE && p->dir == 0 && /* write */ ((p->u.data >> 4) & 0x3) == 0 && /* latch command */ ((p->u.data >> 6) & 0x3) == (vpit->channel)) {/* right channel */ @@ -183,7 +183,7 @@ int intercept_pit_io(ioreq_t *p) return 1; } - if (p->addr == (0x40 + vpit->channel) && + if (p->addr == (PIT_CH0 + vpit->channel) && p->dir == 1) { /* read */ p->u.data = pit_read_io(vpit); resume_pit_io(p); diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c index 8dac1867f4..1bf443ac46 100644 --- a/xen/common/ac_timer.c +++ b/xen/common/ac_timer.c @@ -202,7 +202,7 @@ static void ac_timer_softirq_action(void) do { heap = ac_timers[cpu].heap; now = NOW(); - + while ( (GET_HEAP_SIZE(heap) != 0) && ((t = heap[1])->expires < (now + TIMER_SLOP)) ) { diff --git a/xen/common/domain.c b/xen/common/domain.c index b11ec069fa..1b42a17dca 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -42,8 +42,6 @@ struct domain *do_createdomain(domid_t dom_id, unsigned int cpu) d->domain_id = dom_id; v->processor = cpu; - spin_lock_init(&d->time_lock); - spin_lock_init(&d->big_lock); spin_lock_init(&d->page_alloc_lock); diff --git 
a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 527870de37..e8c70d63b7 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -351,10 +351,10 @@ void free_heap_pages( void scrub_heap_pages(void) { void *p; - unsigned long pfn, flags; + unsigned long pfn; + int cpu = smp_processor_id(); printk("Scrubbing Free RAM: "); - watchdog_disable(); for ( pfn = 0; pfn < (bitmap_size * 8); pfn++ ) { @@ -362,12 +362,15 @@ void scrub_heap_pages(void) if ( (pfn % ((100*1024*1024)/PAGE_SIZE)) == 0 ) printk("."); + if ( unlikely(softirq_pending(cpu)) ) + do_softirq(); + /* Quick lock-free check. */ if ( allocated_in_map(pfn) ) continue; - - spin_lock_irqsave(&heap_lock, flags); - + + spin_lock_irq(&heap_lock); + /* Re-check page status with lock held. */ if ( !allocated_in_map(pfn) ) { @@ -385,11 +388,10 @@ void scrub_heap_pages(void) unmap_domain_page(p); } } - - spin_unlock_irqrestore(&heap_lock, flags); + + spin_unlock_irq(&heap_lock); } - watchdog_enable(); printk("done.\n"); } diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index f8fe1d69e0..84bad115ea 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -635,8 +635,6 @@ static int __init debugtrace_init(void) debugtrace_bytes = bytes; - memset(debugtrace_buf, '\0', debugtrace_bytes); - return 0; } __initcall(debugtrace_init); diff --git a/xen/include/asm-x86/time.h b/xen/include/asm-x86/time.h index 8f48cd31dc..81cd5aae56 100644 --- a/xen/include/asm-x86/time.h +++ b/xen/include/asm-x86/time.h @@ -4,4 +4,7 @@ extern int timer_ack; +extern void calibrate_tsc_bp(void); +extern void calibrate_tsc_ap(void); + #endif /* __X86_TIME_H__ */ diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h index 3e25532f19..66ffeb60aa 100644 --- a/xen/include/public/xen.h +++ b/xen/include/public/xen.h @@ -329,6 +329,28 @@ typedef struct vcpu_info { #endif } vcpu_info_t; +typedef struct vcpu_time_info { + /* + * The following values are updated periodically (and not 
necessarily + * atomically!). The guest OS detects this because 'time_version1' is + * incremented just before updating these values, and 'time_version2' is + * incremented immediately after. See the Xen-specific Linux code for an + * example of how to read these values safely (arch/xen/kernel/time.c). + */ + u32 time_version1; + u32 time_version2; + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_time; /* Time, in nanosecs, since boot. */ + /* + * Current system time: + * system_time + ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32) + * CPU frequency (Hz): + * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift + */ + u32 tsc_to_system_mul; + s8 tsc_shift; +} vcpu_time_info_t; + /* * Xen/kernel shared data -- pointer provided in start_info. * NB. We expect that this struct is smaller than a page. @@ -336,6 +358,8 @@ typedef struct vcpu_info { typedef struct shared_info { vcpu_info_t vcpu_data[MAX_VIRT_CPUS]; + vcpu_time_info_t vcpu_time[MAX_VIRT_CPUS]; + u32 n_vcpu; /* @@ -373,33 +397,11 @@ typedef struct shared_info { u32 evtchn_mask[32]; /* - * Time: The following abstractions are exposed: System Time, Clock Time, - * Domain Virtual Time. Domains can access Cycle counter time directly. + * Wallclock time: updated only by control software. Guests should base + * their gettimeofday() syscall on this wallclock-base value. */ - u64 cpu_freq; /* CPU frequency (Hz). */ - - /* - * The following values are updated periodically (and not necessarily - * atomically!). The guest OS detects this because 'time_version1' is - * incremented just before updating these values, and 'time_version2' is - * incremented immediately after. See the Xen-specific Linux code for an - * example of how to read these values safely (arch/xen/kernel/time.c). - */ - u32 time_version1; - u32 time_version2; - tsc_timestamp_t tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_time; /* Time, in nanosecs, since boot. 
*/ u32 wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ u32 wc_usec; /* Usecs 00:00:00 UTC, Jan 1, 1970. */ - u64 domain_time; /* Domain virtual time, in nanosecs. */ - - /* - * Timeout values: - * Allow a domain to specify a timeout value in system time and - * domain virtual time. - */ - u64 wall_timeout; - u64 domain_timeout; arch_shared_info_t arch; diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 8caebed588..562b3241c6 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -92,7 +92,6 @@ struct domain domid_t domain_id; shared_info_t *shared_info; /* shared data area */ - spinlock_t time_lock; spinlock_t big_lock; diff --git a/xen/include/xen/time.h b/xen/include/xen/time.h index d0091b6a10..88d88039b0 100644 --- a/xen/include/xen/time.h +++ b/xen/include/xen/time.h @@ -30,7 +30,8 @@ #include #include -extern int init_xen_time(); +extern int init_xen_time(void); +extern void init_percpu_time(void); extern unsigned long cpu_khz; -- 2.30.2